import requests
from bs4 import BeautifulSoup
import re
import time
# Request headers sent with every page download; the User-Agent value is a
# desktop Chrome identification string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 Safari/537.36'
    ),
}

# Accumulates one dict per scraped listing across all calls to get_details().
total = []
def get_details(url):
    """Scrape one listing page and append its rows to the global ``total``.

    Downloads ``url`` using the module-level ``headers``, extracts the title,
    district, room layout, floor area, unit price and total price of each
    listing, and appends one dict per listing to ``total``.

    Args:
        url: Address of the list page to download.

    Returns:
        The module-level ``total`` list (the same object on every call),
        including any rows appended by this call.

    Any ordinary error (network failure, HTTP error status, a number that
    cannot be parsed) is reported on stdout; because all parsing happens
    before the first append, a failing page contributes no rows.
    ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted.
    """
    # Every selector below shares this path down to a single listing card.
    card = ('#js-ershoufangList > div.content-wrapper > div.content'
            ' > div.m-list > ul > li > div')
    info = card + ' > div.info-table'
    try:
        res = requests.get(url, headers=headers, timeout=20)
        # Treat 4xx/5xx answers as failures instead of parsing an error page
        # and silently producing zero rows.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        # Listing titles on this page
        titles = [a.text.strip()
                  for a in soup.select(card + ' > div.prop-title > a')]

        # District / address text (second link of the row-2 info column)
        districts = [a.text.strip() for a in soup.select(
            info + ' > div > span.info-col.row2-text > a:nth-of-type(2)')]

        # The same spans carry both the room layout and the floor area,
        # so they are selected once and parsed twice.
        layout_spans = soup.select(info + ' > div:nth-of-type(1) > span')
        # Room layout: the text before the first space
        layouts = [s.text.split(' ')[0].strip() for s in layout_spans]
        # Floor area: the number between '|' and '平'
        areas = [float(m.strip())
                 for s in layout_spans
                 for m in re.findall(r'\|(.*?)平', s.text)]

        # Unit price (stored under the '元每平方' key)
        unit_prices = [int(m)
                       for s in soup.select(
                           info + ' > div:nth-of-type(2)'
                                  ' > span.info-col.price-item.minor')
                       for m in re.findall('价(.+?)元', s.text)]

        # Total price (stored under the '价格(万)' key)
        total_prices = [int(s.text.strip()) for s in soup.select(
            info + ' > div:nth-of-type(1) > div > span.total-price.strong-num')]

        # NOTE(review): the columns are matched purely by position, and the
        # area / unit-price lists drop elements whose text does not match the
        # regex, so one non-matching element would shift every later row.
        # Verify against the page markup.
        for title, district, layout, area, unit_price, price in zip(
                titles, districts, layouts, areas, unit_prices, total_prices):
            total.append({'标题': title,
                          '地区': district,
                          '室厅': layout,
                          '面积(平方)': area,
                          '元每平方': unit_price,
                          '价格(万)': price})
    except Exception as exc:
        # Best-effort scraping: report the failing page and its cause, then
        # let the caller continue with the next page.
        print('抓取失败: {} ({!r})'.format(url, exc))
    return total
if __name__ == '__main__':
    # Visit list pages 1 through 40, sleeping 5 seconds after each one.
    for page in range(1, 41):
        page_url = 'http://sh.lianjia.com/ershoufang/d' + str(page)
        get_details(page_url)
        print('第{}页抓取完毕'.format(page))
        time.sleep(5)

    # pandas is imported only once scraping has finished.
    import pandas as pd

    # Dump every collected row into a spreadsheet in the working directory.
    # NOTE(review): confirm the installed pandas can still write the legacy
    # .xls extension; newer releases may require .xlsx instead.
    pd.DataFrame(total).to_excel('lianjia-3.xls')
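# Scraping 40 pages of second-hand housing listings from Lianjia with Python.
# (Source-page footer: "latest recommended article published 2024-08-13 21:38:27".)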